{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "文章链接"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "https://mp.weixin.qq.com/s?__biz=MzI0NTQ1NDc4Nw==&mid=2247485370&idx=1&sn=6453ae5970411dc7770addef73657b8e&chksm=e94f0516de388c001d978008b470fe4caf837b34c7a1469ed0048474a52826734b4212105401&scene=21#wechat_redirect"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 一 、语料（Corpus）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 1.167 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['商业 新知 : 知识图谱 为 内核 , 构建 商业 创新 服务 完整 生态 。',\n",
       " '如何 更好 利用 知识图谱 技术 做 反 欺诈 ?   360 金融 首席 数据 科学家 沈赟 开讲 。',\n",
       " '知识 管理   |   基于 知识图谱 的 国际 知识 管理 领域 可视化 分析 。',\n",
       " '一文 详解 达观 数据 知识图谱 技术 与 应用 。',\n",
       " '知识图谱 技术 落地 金融 行业 的 关键 四步 。',\n",
       " '一文 读懂 知识图谱 的 商业 应用 进程 及 技术 背景 。',\n",
       " '海云 数据 CPO 王斌 : 打造 大 数据 可视 分析 与 AI 应用 的 高科技 企业 。',\n",
       " '智能 产业 | 《 人工智能 标准化 白皮书 2018 》 带来 创新 创业 新 技术标准 。',\n",
       " '国家语委 重大 科研项目 “ 中华 经典 诗词 知识图谱 构建 技术 研究 ” 开题 。',\n",
       " '最全 知识图谱 介绍 : 关键技术 、 开放 数据 集 、 应用 案例 汇总 。',\n",
       " '中译 语通 Jove   Mind 知识图谱 平台   引领 企业 智能化 发展 。',\n",
       " '知识图谱 : 知识图谱 赋能 企业 数字化 转型 ， 为 企业 升级 转型 注入 新 能量 。']"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import jieba\n",
    "\n",
    "\n",
    "jieba.add_word('知识图谱') #防止“知识图谱”被切错词\n",
    "\n",
    "\n",
    "raw_corpus = ['商业新知:知识图谱为内核,构建商业创新服务完整生态。',\n",
    "'如何更好利用知识图谱技术做反欺诈? 360金融首席数据科学家沈赟开讲。',\n",
    "'知识管理 | 基于知识图谱的国际知识管理领域可视化分析。',\n",
    "'一文详解达观数据知识图谱技术与应用。',\n",
    "'知识图谱技术落地金融行业的关键四步。',\n",
    "'一文读懂知识图谱的商业应用进程及技术背景。',\n",
    "'海云数据CPO王斌:打造大数据可视分析与AI应用的高科技企业。',\n",
    "'智能产业|《人工智能标准化白皮书2018》带来创新创业新技术标准。',\n",
    "'国家语委重大科研项目“中华经典诗词知识图谱构建技术研究”开题。',\n",
    "'最全知识图谱介绍:关键技术、开放数据集、应用案例汇总。',\n",
    "'中译语通Jove Mind知识图谱平台 引领企业智能化发展。',\n",
    "'知识图谱:知识图谱赋能企业数字化转型，为企业升级转型注入新能量。']\n",
    "\n",
    "\n",
    "raw_corpus = [' '.join(jieba.lcut(i)) for i in raw_corpus]  #对语料库中的文档进行分词，便于后续处理\n",
    "raw_corpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['商业', '知识图谱', '构建', '商业', '创新'],\n",
       " ['知识图谱', '技术', '金融', '数据'],\n",
       " ['知识', '管理', '知识图谱', '知识', '管理', '分析'],\n",
       " ['一文', '数据', '知识图谱', '技术'],\n",
       " ['知识图谱', '技术', '金融'],\n",
       " ['一文', '知识图谱', '商业', '技术'],\n",
       " ['数据', '数据', '分析', '企业'],\n",
       " ['创新', '新'],\n",
       " ['知识图谱', '构建', '技术'],\n",
       " ['知识图谱', '数据'],\n",
       " ['知识图谱', '企业'],\n",
       " ['知识图谱', '知识图谱', '企业', '转型', '企业', '转型', '新']]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 移除常用词以及分词\n",
    "stoplist = [i.strip() for i in open('chineseStopWords.txt',encoding='utf-8').readlines()]\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "#将文档中可能存在的西文字符小写化，按空格进行拆分，且去停用词\n",
    "texts = [[word for word in document.lower().split() if word not in stoplist]\n",
    "         for document in raw_corpus]\n",
    "\n",
    "\n",
    "#计算词频\n",
    "from collections import defaultdict\n",
    "frequency = defaultdict(int)\n",
    "for text in texts:\n",
    "    for token in text:\n",
    "        frequency[token] += 1\n",
    "\n",
    "\n",
    "# 仅保留词频数高于1的词汇\n",
    "processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]\n",
    "processed_corpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dictionary(14 unique tokens: ['创新', '商业', '构建', '知识图谱', '技术']...)\n"
     ]
    }
   ],
   "source": [
    "\n",
    "from gensim import corpora\n",
    "\n",
    "dictionary = corpora.Dictionary(processed_corpus)\n",
    "print(dictionary)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 二 、 向量空间（Vector Space）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'创新': 0, '商业': 1, '构建': 2, '知识图谱': 3, '技术': 4, '数据': 5, '金融': 6, '分析': 7, '知识': 8, '管理': 9, '一文': 10, '企业': 11, '新': 12, '转型': 13}\n"
     ]
    }
   ],
   "source": [
    "print(dictionary.token2id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(3, 1), (11, 1), (13, 1)]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "new_doc = \"知识图谱 为 企业 转型 助力\"  #已分词，便于后续处理\n",
    "new_vec = dictionary.doc2bow(new_doc.lower().split())\n",
    "new_vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[(0, 1), (1, 2), (2, 1), (3, 1)],\n",
       " [(3, 1), (4, 1), (5, 1), (6, 1)],\n",
       " [(3, 1), (7, 1), (8, 2), (9, 2)],\n",
       " [(3, 1), (4, 1), (5, 1), (10, 1)],\n",
       " [(3, 1), (4, 1), (6, 1)],\n",
       " [(1, 1), (3, 1), (4, 1), (10, 1)],\n",
       " [(5, 2), (7, 1), (11, 1)],\n",
       " [(0, 1), (12, 1)],\n",
       " [(2, 1), (3, 1), (4, 1)],\n",
       " [(3, 1), (5, 1)],\n",
       " [(3, 1), (11, 1)],\n",
       " [(3, 2), (11, 2), (12, 1), (13, 2)]]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]\n",
    "bow_corpus"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 三 、 模型（Model）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(3, 0.06112717038912965),\n",
       " (4, 0.29351946977430854),\n",
       " (11, 0.46478459877035777),\n",
       " (13, 0.8331176787522306)]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "from gensim import models\n",
    "# 训练模型\n",
    "tfidf = models.TfidfModel(bow_corpus)\n",
    "# 对\"知识图谱这种技术是企业转型的利器\"进行转换\n",
    "tfidf[dictionary.doc2bow(\"知识图谱 这种 技术 是 企业 转型 的 利器\".split())]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
